library(tidyverse)
library(magrittr)
library(limma)
library(edgeR)
library(ggfortify)
library(ngsReports)
library(AnnotationHub)
library(readxl)
library(scales)
library(ggpubr)
library(cowplot)
library(ggrepel)
library(msigdbr)
library(pheatmap)
library(ggeasy)
library(fgsea)
library(harmonicmeanp)
library(goseq)
library(pathview)
library(RColorBrewer)
library(UpSetR)
library(pander)
library(kableExtra)
theme_set(theme_bw())
The following analysis will compare the effect of heterozygosity for an early-onset familial Alzheimer’s disease-like (EOfAD-like) mutation, or a familial Hidradenitis supurativa (fHS-like) mutation in the presenilin 1 (PSEN1) gene on the brain transcriptome of zebrafish at 6 months of age. A family of zebrafish were generated, which contained fish with psen1 genotypes: EOfAD-like/+, fHS-like/+, +/+ and EOfAD-like/fHS-like (transheterozygous). These fish were sacrificed at 6 months of age, their brains removed and RNA was extracted for RNA-seq. Genomic DNA was extracted from tails of these fish for genotyping by PCR, while sex was determined by inspection of body shape and innards. n = 4 fish per genotype and sex were actually used in the RNA seq experiment.
mRNA was prepared for RNA-seq by SAHMRI. polyA+, stranded libraries were generated. Also, uniique molecular identifers (UMIs) were added to the RNA moleculaes to reduce noise due to PCR duplicates. These UMIs were added to each read using fastp. Then, reads were aligned to the zebrafish genome (GRCz11, Ensembl release 98) using STAR (v2.7.0d). Gene expression values were counted using featureCounts.
meta <-
read_excel("meta_fishInfo_2.xlsx") %>%
dplyr::filter(!is.na(Position_plate_seq)) %>%
dplyr::select(fish_id, batch_killed, Genotype, Sex, RINe, Position_plate_seq) %>%
left_join(read_delim("featureCounts_umidedup_release98.txt",
delim = "\t", skip = 1) %>%
set_names(basename(names(.))) %>%
as.data.frame() %>%
colnames() %>%
.[grepl(., pattern = "KB")] %>%
as_tibble() %>%
set_colnames("bamName") %>%
mutate(Position_plate_seq = str_extract(string = bamName,
pattern = '[:alpha:]{1}[:digit:]+')) ) %>%
dplyr::filter(!is.na(bamName)) %>%
mutate(Genotype = str_replace(Genotype,
pattern = "AI-",
replacement = "fAI-"),
sample = paste0(Genotype, "_", Sex, "_",fish_id),
Genotype = factor(Genotype, levels = c("WT", "EOfAD-like/+", "fAI-like/+")),
Genotype_2 = case_when(
Genotype == "EOfAD-like/+" ~ "T428del/+",
Genotype == "fAI-like/+" ~ "W233fs/+",
TRUE ~ "WT"
),
Genotype_2 = factor(Genotype_2, levels = c("WT", "T428del/+", "W233fs/+")))
meta %>%
dplyr::select(fish_id, Genotype, Sex, RINe) %>%
kable(caption = "Sample info") %>%
kableExtra::kable_styling("basic", full_width = F)
| fish_id | Genotype | Sex | RINe |
|---|---|---|---|
| 4 | WT | F | 9.4 |
| 9 | EOfAD-like/+ | M | 9.0 |
| 19 | WT | F | 9.1 |
| 25 | WT | F | 9.4 |
| 12 | fAI-like/+ | F | 8.7 |
| 46 | EOfAD-like/+ | F | 9.2 |
| 21 | fAI-like/+ | M | 9.4 |
| 50 | EOfAD-like/+ | M | 9.3 |
| 55 | WT | M | 9.1 |
| 22 | fAI-like/+ | F | 9.3 |
| 54 | WT | M | 9.3 |
| 26 | fAI-like/+ | M | 9.4 |
| 27 | EOfAD-like/+ | F | 9.3 |
| 34 | WT | M | 9.0 |
| 47 | fAI-like/+ | F | 9.2 |
| 29 | EOfAD-like/+ | M | 9.5 |
| 35 | WT | M | 9.3 |
| 49 | fAI-like/+ | M | 9.5 |
| 30 | EOfAD-like/+ | F | 9.6 |
| 36 | WT | F | 9.7 |
| 44 | fAI-like/+ | M | 9.8 |
| 37 | WT | M | 9.3 |
| 45 | fAI-like/+ | F | 9.4 |
| 40 | EOfAD-like/+ | F | 9.4 |
ah <- AnnotationHub() %>%
subset(species == "Danio rerio") %>%
subset(rdataclass == "EnsDb")
ensDb <- ah[["AH74989"]] # for release 98
grTrans <- transcripts(ensDb)
trLengths <- exonsBy(ensDb, "tx") %>%
width() %>%
vapply(sum, integer(1))
mcols(grTrans)$length <- trLengths[names(grTrans)]
gcGene <- grTrans %>%
mcols() %>%
as.data.frame() %>%
dplyr::select(gene_id, tx_id, gc_content, length) %>%
as_tibble() %>%
group_by(gene_id) %>%
summarise(
gc_content = sum(gc_content*length) / sum(length),
length = ceiling(median(length))
)
grGenes <- genes(ensDb)
mcols(grGenes) %<>%
as.data.frame() %>%
left_join(gcGene) %>%
as.data.frame() %>%
DataFrame()
fastqc_raw <- list.files(
path = "fastqc_raw/",
pattern = "zip",
recursive = TRUE,
full.names = TRUE
) %>%
FastqcDataList()
Note that the duplicated number of reads was minimised as we have deduplicaated PCR duplicates from the Unique Molecular Identifiers (UMIs).
plotReadTotals(fastqc_raw)
Library Sizes for each paired end read for each sample before any processing was undertaken.
GC content was also assessed. The distribution in the plot below is similar to distributions we have observed in previous polyA+ RNA-seq experiments.
plotGcContent(
x = fastqc_raw,
plotType = "line",
gcType = "Transcriptome",
species = "Drerio"
) +
theme(legend.position = "none")
Genes which are lowly expressed are uninformative for DE analysis. I will follow the 10/min.lib.size rule to filter lowly expressed genes. The effect of filtering is be shown in the density plots below.
minCPM <- 0.1
minSamples <- 12
featureCounts <-
read_delim("featureCounts_umidedup_release98.txt", delim = "\t", skip = 1) %>%
set_names(basename(names(.))) %>%
as.data.frame() %>%
dplyr::select(-c(Chr, Start, End, Length, Strand)) %>%
gather(key = "bamName", value = "counts", contains("KB")) %>%
left_join(meta) %>%
dplyr::select(Geneid, counts, sample) %>%
spread(key = "sample", value = "counts") %>%
column_to_rownames("Geneid")
a <- featureCounts %>%
cpm(log = TRUE) %>%
as.data.frame() %>%
pivot_longer(
cols = everything(),
names_to = "sample",
values_to = "logCPM"
) %>%
split(f = .$sample) %>%
lapply(function(x){
d <- density(x$logCPM)
tibble(
sample = unique(x$sample),
x = d$x,
y = d$y
)
}) %>%
bind_rows() %>%
left_join(meta) %>%
ggplot(aes(x, y, colour = Genotype, group = sample)) +
geom_line() +
labs(
x = "logCPM",
y = "Density",
colour = "Genotype"
)+
ggtitle("Before filtering") +
theme_bw()
b <- featureCounts %>%
.[rowSums(cpm(.) >= 1) >= minSamples,] %>%
cpm(log = TRUE) %>%
as.data.frame() %>%
pivot_longer(
cols = everything(),
names_to = "sample",
values_to = "logCPM"
) %>%
split(f = .$sample) %>%
lapply(function(x){
d <- density(x$logCPM)
tibble(
sample = unique(x$sample),
x = d$x,
y = d$y
)
}) %>%
bind_rows() %>%
left_join(meta) %>%
ggplot(aes(x, y, colour = Genotype, group = sample)) +
geom_line() +
labs(
x = "logCPM",
y = "Density",
colour = "Genotype"
)+
ggtitle("After filtering") +
theme_bw()
ggarrange(a, b, common.legend = TRUE)
x <- featureCounts %>%
as.matrix() %>%
.[rowSums(cpm(.) >= minCPM) >= minSamples,] %>%
DGEList(
samples = tibble(sample = colnames(.)) %>%
left_join(meta),
genes = grGenes[rownames(.)] %>%
as.data.frame() %>%
dplyr::select(
chromosome = seqnames, start, end,
gene_id, gene_name, gene_biotype, description, entrezid
) %>%
left_join(gcGene) %>%
as_tibble()
) %>%
calcNormFactors()
Gene-level count data as output by featureCounts, was loaded and formed into a DGEList object. During this process, genes were removed if they were not considered as detectable (CPM < 0.1 in > 12 samples). This translates to > 7 reads assigned a gene in at least half of the RNA seq samples..
These filtering steps returned gene-level counts for 22,005 genes, with total library sizes between 69,268,551, 109,940,513 reads assigned to genes.
For the purposes of genotype checking, the umi-dedup .bam files were converted back to paired fastq files, then were pseudo-aligned to a modified version of the zebrafish trannscriptome (GRCz11, cDNA , primary assembly). This transcritome manually included the psen1 transcripts with the mutations.
transcounts <- list.files("kallisto/", full.names = TRUE) %>%
catchKallisto()
## Reading kallisto//1_KB_A10_HTVLLDRXX_AACTCGGA-GTCTTGCA_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//10_KB_B6_HTVLLDRXX_GCACACAA-GATTGCTC_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//11_KB_B7_HTVLLDRXX_CTCCTAGT-AGATGAGG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//12_KB_B8_HTVLLDRXX_TCTTCGAC-GAGCAGTA_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//13_KB_B9_HTVLLDRXX_GACTACGA-GTCGGTAA_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//14_KB_C10_HTVLLDRXX_CCAACACT-AGTTCGTC_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//15_KB_C11_HTVLLDRXX_TCTAGGAG-CGTTGCAA_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//16_KB_C12_HTVLLDRXX_CTCGAACA-TTCGTTGG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//17_KB_C1_HTVLLDRXX_ACCATCCT-GTTCATGG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//18_KB_C2_HTVLLDRXX_CGTCCATT-AGGAGGAA_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//19_KB_C3_HTVLLDRXX_AACTTGCC-GAAGTTGG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//2_KB_A11_HTVLLDRXX_GTCGAGAA-GCTGTAAG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//20_KB_C4_HTVLLDRXX_GTACACCT-TTGGTCTC_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//21_KB_C5_HTVLLDRXX_ACGAGAAC-CGAACTGT_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//22_KB_C6_HTVLLDRXX_CGACCTAA-CAGGTTAG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//23_KB_C7_HTVLLDRXX_TACATCGG-TGATCGGA_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//24_KB_C8_HTVLLDRXX_ATCGTCTC-GTCCTTCT_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//3_KB_A1_HTVLLDRXX_CGCTACAT-CGTAGGTT_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//4_KB_A6_HTVLLDRXX_AATCCAGC-TAGGATGC_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//5_KB_A7_HTVLLDRXX_CGTCTAAC-ACTCGTTG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//6_KB_B11_HTVLLDRXX_ACTCCTAC-AGTTACGG_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//7_KB_B12_HTVLLDRXX_CTTCCTTC-ACCTGGAA_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//8_KB_B3_HTVLLDRXX_ACAACAGC-TTGTCGGT_merged.umidup, 51752 transcripts, 100 bootstraps
## Reading kallisto//9_KB_B5_HTVLLDRXX_ATGACAGG-TGGCATGT_merged.umidup, 51752 transcripts, 100 bootstraps
colnames(transcounts$counts) %<>%
basename() %>%
str_extract(pattern = '[:alpha:]{1}[:digit:]+')
psen1_txids <- grTrans %>%
as_tibble() %>%
dplyr::filter(gene_id == "ENSDARG00000004870") %>%
.$tx_id_version
psen1_txids %<>%
sapply(function(x) {
x %>%
paste0("_W233fs")
}) %>%
unlist() %>%
as_tibble() %>%
bind_rows(psen1_txids %>%
sapply(function(x) {
x %>%
paste0("_T428del")
}) %>%
unlist() %>%
as_tibble()
) %>%
bind_rows(psen1_txids %>%
as_tibble)
transcounts$counts %>%
cpm() %>%
as.data.frame() %>%
rownames_to_column("tx_id") %>%
as_tibble() %>%
dplyr::filter(tx_id %in% psen1_txids$value) %>%
gather(key = "Position_plate_seq", value = "CPM", !starts_with("tx")) %>%
mutate(psen1 = case_when(
grepl(tx_id, pattern = "T428del") ~ "T428del",
grepl(tx_id, pattern = "W233fs") ~ "W233fs",
TRUE ~ "WT"
)) %>%
group_by(Position_plate_seq, psen1) %>%
summarise(CPM = sum(CPM)) %>%
left_join(meta) %>%
ggplot(aes(Position_plate_seq, CPM, fill = psen1)) +
geom_bar(stat = "identity", colour = "black") +
scale_fill_viridis_d()+
theme_bw() +
facet_wrap(~Genotype, scales = "free_x")
This didnt really work. So I manually inspected the alignments using IGV and counted the number of reads aligning to the WT and mutant sequences at each site. For the T428 site, I took the average number of reads for the 3 codons, for the W233 site, it was the avergae number of reads for the 2 codons.
psen1_gene_id <- "ENSDARG00000004870"
ggarrange(
cpm(x, log = TRUE) %>%
as.data.frame() %>%
.[psen1_gene_id,] %>%
gather(key = "sample", value = "logCPM") %>%
left_join(x$samples) %>%
ggplot(aes(x = Genotype, y = logCPM, fill = Genotype)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(aes(shape = Sex),
size = 3, colour = "black") +
theme_bw() +
easy_rotate_x_labels(angle = -45) +
ggtitle("psen1 expression") +
labs(x = "Genotype",
fill = "Genotype"),
ggarrange(
read_excel("numreads_psen1Sites.xlsx") %>%
left_join(x$samples) %>%
mutate(total_number_reads_T428del = T428_wt + T428del_reads,
total_number_reads_W233fs = W233_wt + W233fs_reads) %>%
ggplot(aes(x = sample, y = total_number_reads_T428del)) +
geom_col(fill = "#350463") +
geom_col(aes(y = T428del_reads), fill = "#1F968CFF") +
facet_wrap(~Genotype, scales = "free_x") +
theme_bw() +
labs(x = "Sample",
y = "Number of reads") +
easy_rotate_x_labels(angle = -90) +
ggtitle("Number of reads aligning to the T428 site"),
read_excel("numreads_psen1Sites.xlsx") %>%
left_join(x$samples) %>%
mutate(total_number_reads_T428del = T428_wt + T428del_reads,
total_number_reads_W233fs = W233_wt + W233fs_reads) %>%
ggplot(aes(x = sample, y = total_number_reads_W233fs)) +
geom_col(fill = "#350463") +
geom_col(aes(y = W233fs_reads), fill = "#FDE725FF") +
facet_wrap(~Genotype, scales = "free_x") +
theme_bw() +
labs(x = "Sample",
y = "Number of reads") +
easy_rotate_x_labels(angle = -90) +
ggtitle("Number of reads aligning to the W233 site"),
labels = c("B", "C"),
ncol = 1,
nrow = 2),
common.legend = F,
labels = c("A", "")
)
read_excel("numreads_psen1Sites.xlsx") %>%
left_join(x$samples) %>%
gather(key = "allele", value = "numReads", 2:5) %>%
mutate(scaled_reads = numReads*norm.factors,
half_scaled_reads = scaled_reads/2) %>%
dplyr::filter(allele %in% c("W233_wt")) %>%
ggplot(aes(x = Genotype)) +
geom_boxplot(aes(fill = Genotype, y = scaled_reads),
outlier.shape = NA) +
geom_point(aes(shape = Sex, y = scaled_reads),
size= 4,
position = position_jitter(seed = 1, height = 0, width = 0.2)) +
# add the secod layer of points
geom_boxplot(aes(fill = Genotype, y = half_scaled_reads),
outlier.shape = NA,
alpha = 0.5,
data = . %>% dplyr::filter(Genotype == "WT")) +
geom_point(aes(shape = Sex, y = half_scaled_reads),
size= 4,
position = position_jitter(seed = 1, height = 0, width = 0.2),
alpha = 0.5,
data = . %>% dplyr::filter(Genotype == "WT")
) +
stat_summary(aes(y = half_scaled_reads),
data = . %>% dplyr::filter(Genotype == "WT"),
fun = "mean", geom = "point", shape= 23, size= 3, alpha = 0.5, fill= "red") +
stat_summary(aes( y = scaled_reads),
fun = "mean", geom = "point", shape= 23, size= 3, fill= "red") +
theme_bw() +
scale_y_continuous(limits = c(20, 180)) +
labs(y = "Number of reads aligning to the wild type W233 site\n(scaled by RNA-seq library size)") +
ggtitle("Number of reads aligning to the WT W233 site")
# t.test(x = read_excel("numreads_psen1Sites.xlsx") %>%
# left_join(x$samples) %>%
# gather(key = "allele", value = "numReads", 2:5) %>%
# mutate(scaled_reads = numReads*norm.factors) %>%
# dplyr::filter(allele %in% c("W233_wt"),
# Genotype == "fAI-like/+") %>%
# .$scaled_reads,
# y = read_excel("numreads_psen1Sites.xlsx") %>%
# left_join(x$samples) %>%
# gather(key = "allele", value = "numReads", 2:5) %>%
# mutate(scaled_reads = numReads*norm.factors) %>%
# dplyr::filter(allele %in% c("W233_wt"),
# Genotype == "WT") %>%
# .$scaled_reads/2
# )
First, I will explore the overall similarity between samples using principal component analysis. The first two principal components explained less of the total variability than one would expxect. No clear seperatation between samples is observed of Genotype or Sex.
ggarrange(
x$counts %>%
cpm(log=TRUE) %>%
t() %>%
prcomp() %>%
autoplot(data = tibble(sample = rownames(.$x)) %>%
left_join(x$samples),
colour = "Genotype",
shape = "Sex",
size = 6
) +
theme_bw() +
theme(aspect.ratio = 1) +
labs(colour = "Genotype") ,
x$counts %>%
cpm(log=TRUE) %>%
t() %>%
prcomp() %>%
summary() %>%
.$importance %>%
as.data.frame() %>%
t() %>%
as.data.frame() %>%
rownames_to_column("PC") %>%
mutate(`Proportion of Variance` = `Proportion of Variance`*100,
`Cumulative Proportion` = `Cumulative Proportion`*100) %>%
ggplot(aes(x = reorder(PC, -`Proportion of Variance`))) +
geom_col(aes( y = `Proportion of Variance`)) +
labs(y = "Proportion of Variance (%)",
x = "Principal Component") +
geom_point(aes(y = `Cumulative Proportion`))+
geom_line(aes(y = `Cumulative Proportion`, group = 1), show.legend = TRUE) +
theme_bw() +
theme(aspect.ratio = 1) +
rotate_x_text(angle = -45),
common.legend = FALSE,
labels = "AUTO"
)
# ggsave("plots/PCA_scree.png", width = 18, height = 10, units = "cm", dpi = 800, scale = 1.5)
I also want to see if any technical artefacts correlate with PC1 or PC2 (i.e. RNA-seq library size or RNA quality). No observed correlations are presnt.
ggarrange(
x$counts %>%
cpm(log = TRUE) %>%
t() %>%
prcomp %>%
.$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(x$samples) %>%
mutate(`Library Size (millions) ` = lib.size/1e6) %>%
ggplot(aes(x = PC1, y = `Library Size (millions) `)) +
geom_point(aes( colour = Genotype, shape = Sex),
size = 6) +
geom_smooth(method = "lm") +
labs(colour = "Genotype") +
theme_bw(),
x$counts %>%
cpm(log = TRUE) %>%
t() %>%
prcomp %>%
.$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(x$samples) %>%
mutate(`Library Size (millions) ` = lib.size/1e6) %>%
ggplot(aes(x = PC2, y = `Library Size (millions) `)) +
geom_point(aes( colour = Genotype, shape = Sex),
size = 6) +
geom_smooth(method = "lm") +
labs(colour = "Genotype") +
theme_bw(),
common.legend = TRUE,
labels = "AUTO"
)
#ggsave("plots/PC1vLibsize.png", height = 4)
ggarrange(
x$counts %>%
cpm(log = TRUE) %>%
t() %>%
prcomp %>%
.$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(x$samples) %>%
ggplot(aes(x = PC1, y = RINe)) +
geom_point(aes( colour = Genotype, shape = Sex),
size = 6) +
geom_smooth(method = "lm") +
theme_bw(),
x$counts %>%
cpm(log = TRUE) %>%
t() %>%
prcomp %>%
.$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(x$samples) %>%
ggplot(aes(x = PC2, y = RINe)) +
geom_point(aes( colour = Genotype, shape = Sex),
size = 6) +
geom_smooth(method = "lm") +
theme_bw(),
common.legend = T
)
design <- model.matrix(~Genotype, data = x$samples) %>%
set_colnames(str_remove(colnames(.), pattern = "Genotype"))
design %>%
as.data.frame() %>%
pheatmap(cluster_rows = FALSE, cluster_cols = FALSE,
color = brewer_pal(palette = "Set1")(2),
cellheight = 15, cellwidth = 15)
Visualisation of design matrix
fit_1 <- x %>%
estimateDisp(design) %>%
glmFit(design)
toptable_1 <- design %>% colnames() %>% .[2:3] %>%
sapply(function(x){
glmLRT(fit_1, coef = x) %>%
topTags(n = Inf) %>%
.[["table"]] %>%
as_tibble() %>%
arrange(PValue) %>%
mutate(
coef = x,
DE = FDR < 0.05
) %>%
dplyr::select(
gene_name, logFC, logCPM, PValue, FDR, DE, everything()
)
}, simplify = FALSE)
toptable_1 %>%
lapply(function(x) {
x %>%
dplyr::filter(DE == TRUE)
}) %>%
bind_rows() %>%
dplyr::select(coef, gene_name, logFC, logCPM, PValue, FDR, DE) %>%
kable(caption = "DE genes due to each mutation relative to WT") %>%
kable_styling()
| coef | gene_name | logFC | logCPM | PValue | FDR | DE |
|---|---|---|---|---|---|---|
| EOfAD-like/+ | col12a1a | 1.8959094 | 3.8394694 | 0.00e+00 | 0.0000000 | TRUE |
| EOfAD-like/+ | si:ch211-235i11.5 | 0.6904772 | 2.8948813 | 0.00e+00 | 0.0000000 | TRUE |
| EOfAD-like/+ | ano9b | 1.2713444 | -1.1769775 | 0.00e+00 | 0.0000246 | TRUE |
| EOfAD-like/+ | si:ch211-235i11.4 | 0.3423547 | 3.5708023 | 9.00e-07 | 0.0039792 | TRUE |
| EOfAD-like/+ | CR318588.1 | 3.3926411 | 2.4513599 | 1.00e-06 | 0.0039792 | TRUE |
| EOfAD-like/+ | myhb | 4.1004025 | 2.3851026 | 1.10e-06 | 0.0039792 | TRUE |
| EOfAD-like/+ | si:ch211-235i11.6 | 0.5029123 | 1.8847032 | 1.70e-06 | 0.0050857 | TRUE |
| EOfAD-like/+ | adi1 | 0.5302238 | 2.7029554 | 1.80e-06 | 0.0050857 | TRUE |
| EOfAD-like/+ | CR318588.3 | 2.0624035 | 2.7489762 | 3.00e-06 | 0.0073193 | TRUE |
| EOfAD-like/+ | CABZ01068209.1 | -0.9723557 | -0.9288210 | 9.80e-06 | 0.0216029 | TRUE |
| EOfAD-like/+ | mov10b.2 | 1.3400381 | -1.8722367 | 1.57e-05 | 0.0306411 | TRUE |
| EOfAD-like/+ | znf982 | 0.3808937 | 2.2570512 | 1.67e-05 | 0.0306411 | TRUE |
| EOfAD-like/+ | stac3 | 1.5755831 | -2.4135388 | 2.66e-05 | 0.0450699 | TRUE |
| fAI-like/+ | psen1 | -0.7992024 | 5.3419313 | 0.00e+00 | 0.0000000 | TRUE |
| fAI-like/+ | dglucy | -0.9591401 | 5.0136276 | 0.00e+00 | 0.0000316 | TRUE |
| fAI-like/+ | sh3pxd2ab | -2.9861437 | 1.0043260 | 0.00e+00 | 0.0002255 | TRUE |
| fAI-like/+ | fam167aa | -3.1575730 | 0.3483399 | 1.00e-07 | 0.0008048 | TRUE |
| fAI-like/+ | kcnk17 | -1.3778914 | -1.1065158 | 6.00e-06 | 0.0264798 | TRUE |
toptable_1 %>%
bind_rows() %>%
ggplot(aes(x = logCPM, y = logFC, colour = DE)) +
geom_point(
alpha = 0.5
) +
facet_grid(rows = vars(coef)) +
theme_bw() +
geom_label_repel(
aes(label = gene_name),
data = . %>% dplyr::filter(FDR < 0.05),
show.legend = FALSE
) +
theme(legend.position = "none") +
ggtitle("MD Plots of different psen1 mutant comparisons to WT with TMM normalisation") +
scale_color_manual(values = c("grey50", "red"))
toptable_1 %>%
bind_rows() %>%
ggplot(aes(y = -log10(PValue), x = logFC, colour = DE)) +
geom_point(
alpha = 0.5
) +
facet_grid(rows = vars(coef)) +
theme_bw() +
geom_label_repel(
aes(label = gene_name),
data = . %>% dplyr::filter(FDR < 0.05),
show.legend = FALSE
) +
theme(legend.position = "none") +
ggtitle("Volcano Plots of different psen1 mutant comparisons to WT with TMM normalisation") +
coord_cartesian(ylim = c(0, 10)) +
scale_color_manual(values = c("grey50", "red"))
A ranking statistic of the sign of the logFC multiplied by the log10 of the p-value was calculated for each gene. I plotted this ranking statistic against the gc content and length of each gene to see if there are any biases observed. If there was, I would use the cqn normalisation method to correct for this. No apparent biases were observed, therefore cqn is not appropriate.
Note that the limits for the y axis (ranking stat) are zoomed in for visualisation purposes.
ggarrange(
toptable_1 %>%
bind_rows() %>%
mutate(rankstat = sign(logFC)*-log10(PValue)) %>%
ggplot(aes(x = length, y = rankstat)) +
geom_point(
aes(colour = DE),
alpha = 0.5
) +
geom_smooth(se = FALSE, method = "gam") +
facet_grid(rows = vars(coef)) +
theme_bw() +
theme(legend.position = "none") +
scale_color_manual(values = c("grey50", "red")) +
scale_x_log10()+
labs(x = "Average transcript length per gene",
colour = "Differentially expressed?",
y = "sign(logFC)*-log10(PValue)") +
coord_cartesian(ylim = c(-10, 10)),
toptable_1 %>%
bind_rows() %>%
mutate(rankstat = sign(logFC)*-log10(PValue)) %>%
ggplot(aes(x = gc_content, y = rankstat)) +
geom_point(
aes(colour = DE),
alpha = 0.5
) +
geom_smooth(se = FALSE, method = "gam") +
facet_grid(rows = vars(coef)) +
theme_bw() +
theme(legend.position = "none") +
scale_color_manual(values = c("grey50", "red")) +
coord_cartesian(ylim = c(-10,10)) +
labs(x = "Weighted average GC content (%) per gene",
colour = "Differentially expressed?",
y = "sign(logFC)*-log10(PValue)"),
common.legend = TRUE,
labels = "AUTO"
)
# ggsave("plots/gclength.png", width = 18, height = 10, units = "cm", dpi = 600)
# GO terms
goSummaries <- url("https://uofabioinformaticshub.github.io/summaries2GO/data/goSummaries.RDS") %>%
readRDS() %>%
mutate(
Term = Term(id),
gs_name = Term %>% str_to_upper() %>% str_replace_all("[ -]", "_"),
gs_name = paste0("GO_", gs_name)
)
minPath <- 3
# get a mapping file of entrez to ensembl ids
ens2Entrez <- grGenes %>%
as_tibble() %>%
dplyr::select(gene_id, entrezid) %>%
unchop(entrezid) %>%
dplyr::filter(gene_id %in% rownames(x)) %>%
dplyr::filter(!is.na(entrezid))
GO <- msigdbr("Danio rerio", category = "C5") %>%
dplyr::filter(grepl(gs_name, pattern = "^GO_")) %>%
dplyr::rename(entrezid = entrez_gene) %>%
inner_join(ens2Entrez) %>%
dplyr::distinct(gs_name, gene_id, .keep_all = TRUE) %>%
split(f = .$gs_name) %>%
lapply(extract2, "gene_id")
KEGG <- msigdbr("Danio rerio", category = "C2", subcategory = "CP:KEGG") %>%
left_join(x$genes, by = c("gene_symbol" = "gene_name")) %>%
dplyr::filter(gene_id %in% rownames(x)) %>%
distinct(gs_name, gene_id, .keep_all = TRUE) %>%
split(f = .$gs_name) %>%
lapply(extract2, "gene_id")
ireGenes <- readRDS("zebrafishIreGenes.rds") %>%
unlist() %>%
as.data.frame() %>%
rownames_to_column("ire") %>%
as_tibble() %>%
mutate(ire = str_extract(ire, pattern = "ire[:digit:]_[:alpha:]+")) %>%
set_colnames(c("ire", "gene_id")) %>%
dplyr::filter(gene_id %in% rownames(x)) %>%
split(f = .$ire) %>%
lapply(magrittr::extract2,"gene_id")
chr <- x$genes %>%
dplyr::select(gene_id, chromosome) %>%
split(f = .$chromosome) %>%
lapply(extract2, "gene_id") %>%
.[c("MT", seq(1:25))] # omit the ALT chromosmes and scaffolds
To test for enrichment of GO terms within the DE gene, I used goseq. I used the average transcript length per gene to calculate the PWF.
pwfs <- toptable_1 %>%
lapply(function(x) {
x %>%
as_tibble() %>%
distinct(gene_name, .keep_all = TRUE)
}) %>%
lapply(function(x) {
x %>%
with(
nullp(
DEgenes = structure(DE, names = gene_id),
bias.data = length
)
)
})
goseq(pwfs$`EOfAD-like/+`, gene2cat = c(GO)) %>%
as_tibble() %>%
mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr")) %>%
dplyr::select(-under_represented_pvalue) %>%
head(10) %>%
kable(caption = "Top 10 overrep GO terms in EOfAD-like/+ brains") %>%
kable_styling()
| category | over_represented_pvalue | numDEInCat | numInCat | FDR |
|---|---|---|---|---|
| GO_SKELETAL_MUSCLE_CONTRACTION | 0.0000383 | 2 | 32 | 0.3924105 |
| GO_MULTICELLULAR_ORGANISMAL_MOVEMENT | 0.0000774 | 2 | 44 | 0.3963241 |
| GO_STRIATED_MUSCLE_CONTRACTION | 0.0008309 | 2 | 147 | 1.0000000 |
| GO_L_METHIONINE_SALVAGE_FROM_METHYLTHIOADENOSINE | 0.0014209 | 1 | 5 | 1.0000000 |
| GO_CALCIUM_ACTIVATED_PHOSPHOLIPID_SCRAMBLING | 0.0014353 | 1 | 4 | 1.0000000 |
| GO_AMINO_ACID_SALVAGE | 0.0017122 | 1 | 6 | 1.0000000 |
| GO_REGULATION_OF_INORGANIC_ANION_TRANSMEMBRANE_TRANSPORT | 0.0021386 | 1 | 6 | 1.0000000 |
| GO_MYOSIN_LIGHT_CHAIN_BINDING | 0.0022972 | 1 | 7 | 1.0000000 |
| GO_PHOSPHOLIPID_SCRAMBLASE_ACTIVITY | 0.0023459 | 1 | 7 | 1.0000000 |
| GO_REGULATION_OF_ANION_CHANNEL_ACTIVITY | 0.0023529 | 1 | 7 | 1.0000000 |
goseq(pwfs$`fAI-like/+`, gene2cat = c(GO)) %>%
as_tibble() %>%
mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr")) %>%
dplyr::select(-under_represented_pvalue) %>%
head(10) %>%
kable(caption = "Top 10 overrep GO terms in fAI-like/+ brains") %>%
kable_styling()
| category | over_represented_pvalue | numDEInCat | numInCat | FDR |
|---|---|---|---|---|
| GO_GAMMA_SECRETASE_COMPLEX | 0.0009466 | 1 | 4 | 1 |
| GO_NOTCH_RECEPTOR_PROCESSING_LIGAND_DEPENDENT | 0.0011819 | 1 | 5 | 1 |
| GO_ASPARTIC_ENDOPEPTIDASE_ACTIVITY_INTRAMEMBRANE_CLEAVING | 0.0011832 | 1 | 5 | 1 |
| GO_CATION_CHANNEL_ACTIVITY | 0.0013874 | 2 | 276 | 1 |
| GO_REGULATION_OF_L_GLUTAMATE_IMPORT_ACROSS_PLASMA_MEMBRANE | 0.0014149 | 1 | 6 | 1 |
| GO_CHOLINE_TRANSPORT | 0.0014174 | 1 | 6 | 1 |
| GO_REGULATION_OF_AMYLOID_FIBRIL_FORMATION | 0.0016558 | 1 | 7 | 1 |
| GO_MODULATION_OF_AGE_RELATED_BEHAVIORAL_DECLINE | 0.0016559 | 1 | 7 | 1 |
| GO_REGULATION_OF_RESTING_MEMBRANE_POTENTIAL | 0.0018891 | 1 | 8 | 1 |
| GO_REGULATION_OF_CORE_PROMOTER_BINDING | 0.0018891 | 1 | 8 | 1 |
Since no GO terms were found to be overepresented in the DE lists, I used the ranked list approach I have used previously.
fryRes1 <- design %>% colnames() %>% .[2:3] %>%
sapply(function(y) {
cpm(x$counts, log = TRUE) %>%
fry(
index = c(KEGG, ireGenes),
design = design,
contrast = y,
sort = "mixed"
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = y)
}, simplify = FALSE)
cameraRes1 <- design %>% colnames() %>% .[2:3] %>%
sapply(function(y) {
x %>%
cpm(log = TRUE) %>%
camera(
index = c(KEGG, ireGenes),
design = design,
contrast = y,
inter.gene.cor = NA,
sort = TRUE
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = y,
sig = FDR < 0.05)
}, simplify = FALSE)
ranks_fgsea1 <-
sapply(toptable_1, function(x) {
x %>%
mutate(PValue_withsign = sign(logFC) * log10(1/PValue)) %>%
arrange(PValue_withsign) %>%
dplyr::select(c("gene_id", "PValue_withsign")) %>% #only want the Pvalue with sign
with(structure(PValue_withsign, names = gene_id)) %>%
rev() # reverse so the start of the list is upregulated genes
}, simplify = FALSE)
set.seed(2)
fgseaRes1 <- ranks_fgsea1 %>%
lapply(function(x){
fgseaMultilevel(stats = x, pathways = c(KEGG, ireGenes)) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::select(pathway, pval, FDR, padj, everything()) %>%
arrange(pval) %>%
mutate(sig = padj < 0.05)
})
fgseaRes1$`EOfAD-like/+` %<>%
mutate(coef = "EOfAD-like/+" )
fgseaRes1$`fAI-like/+` %<>%
mutate(coef = 'fAI-like/+')
HMP_1 <- fryRes1 %>%
bind_rows() %>%
dplyr::select(pathway, PValue.Mixed, coef) %>%
dplyr::rename(fry_p = PValue.Mixed) %>%
left_join(cameraRes1 %>%
bind_rows() %>%
dplyr::select(pathway, PValue, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(camera_p = PValue) %>%
left_join(fgseaRes1 %>%
bind_rows() %>%
dplyr::select(pathway, pval, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(fgsea_p = pval) %>%
bind_rows() %>%
nest(p = one_of(c("fry_p", "camera_p", "fgsea_p"))) %>%
mutate(harmonic_p = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
p.hmp(x, L = 4)
}, numeric(1))
) %>%
unnest() %>%
mutate(harmonic_p_FDR = p.adjust(harmonic_p, "fdr"),
sig = harmonic_p_FDR < 0.05) %>%
arrange(harmonic_p)
sigPaths <-
HMP_1 %>%
dplyr::filter(harmonic_p_FDR < 0.05) %>%
dplyr::filter(grepl(pathway, pattern = "KEGG")) %>%
.$pathway %>%
unique
sig_in <-
HMP_1 %>%
dplyr::select(pathway, sig, coef) %>%
spread(key = 'coef', value = 'sig') %>%
mutate(sig_in = case_when(
`EOfAD-like/+` == "TRUE" & `fAI-like/+` == "TRUE" ~ 'both',
`EOfAD-like/+` == FALSE & `fAI-like/+` == "TRUE" ~ 'fHS only',
`EOfAD-like/+` == "TRUE" & `fAI-like/+` == FALSE ~ 'EOfAD only',
TRUE ~ 'neither'
)) %>%
dplyr::select(pathway, sig_in)
HMP_1 %>%
dplyr::filter(pathway %in% sigPaths) %>%
left_join(sig_in) %>%
ggplot(aes(y = coef, x = pathway)) +
geom_tile(aes(fill = -log10(harmonic_p)), colour = 'black'
) +
geom_text(aes(label = signif(harmonic_p_FDR, digits = 2)),
colour = "white") +
geom_text(aes(label = signif(harmonic_p_FDR, digits = 2)),
data = . %>%
dplyr::filter(pathway == "KEGG_NOTCH_SIGNALING_PATHWAY" &
coef == "fAI-like/+"),
colour = "black") +
theme_bw() +
easy_rotate_x_labels(angle = -90) +
theme(panel.grid = element_blank(),
panel.border = element_blank(),
strip.background = element_blank(),
strip.text = element_blank()) +
labs(x = "", y = "") +
facet_grid(cols = vars(sig_in), scales = 'free', shrink = FALSE, space = 'free') +
scale_fill_viridis_c(option = 'viridis')
# upset leading edeg EOfAD
fgseaRes1$`EOfAD-like/+` %>%
dplyr::filter(pathway %in%c(HMP_1 %>%
dplyr::filter(coef == "EOfAD-like/+" & sig == TRUE) %>%
.$pathway)
) %>%
dplyr::select(pathway, leadingEdge) %>%
unnest(cols = c(leadingEdge)) %>%
split(f = .$pathway) %>%
lapply(magrittr::extract2, "leadingEdge") %>%
UpSetR::fromList() %>%
UpSetR::upset(order.by = "freq", nsets = length(.))
# upset leading edeg fHS
fgseaRes1$`fAI-like/+` %>%
dplyr::filter(pathway %in%c(HMP_1 %>%
dplyr::filter(coef == "fAI-like/+" & sig == TRUE) %>%
.$pathway)
) %>%
# shorten this name for vis purposes
mutate(pathway = str_replace(pathway, pattern = "KEGG_ARRHYTHMOGENIC_RIGHT_VENTRICULAR_CARDIOMYOPATHY_ARVC",
replacement = "KEGG_ARRHYTHMOGENIC_RIGHT_VENT...")) %>%
dplyr::filter(grepl(pathway, pattern = "KEGG")) %>%
dplyr::select(pathway, leadingEdge) %>%
unnest(cols = c(leadingEdge)) %>%
split(f = .$pathway) %>%
lapply(magrittr::extract2, "leadingEdge") %>%
UpSetR::fromList() %>%
UpSetR::upset(order.by = "freq", nsets = length(.) )
#png("plots/cyto_leadingEdgePheatmap.png", width = 35, height = 5, units = "cm", res = 800)
fgseaRes1 %>%
bind_rows() %>%
dplyr::filter(pathway == "KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION") %>%
dplyr::select(leadingEdge, coef) %>%
set_colnames(c("gene_id", "coef")) %>%
unnest(cols = c(gene_id)) %>%
left_join(toptable_1 %>% bind_rows(), by = c("coef", "gene_id")) %>%
dplyr::select(gene_name, coef, logFC) %>%
dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>%
spread(key = "coef", value = "logFC") %>%
column_to_rownames("gene_name") %>%
pheatmap(cluster_cols = F,cluster_rows = F,
na_col = "grey80",
cellwidth = 12, cellheight = 12,
color = colorRampPalette(brewer.pal(n = 7, name = "YlOrRd"))(100),
)
# dev.off()
#
# png("plots/jakstat_leadingEdgePheatmap.png", width = 35, height = 5, units = "cm", res = 800)
fgseaRes1 %>%
bind_rows() %>%
dplyr::filter(pathway == "KEGG_JAK_STAT_SIGNALING_PATHWAY") %>%
dplyr::select(leadingEdge, coef) %>%
set_colnames(c("gene_id", "coef")) %>%
unnest(cols = c(gene_id)) %>%
left_join(toptable_1 %>% bind_rows(), by = c("coef", "gene_id")) %>%
dplyr::select(gene_name, coef, logFC) %>%
dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>%
spread(key = "coef", value = "logFC") %>%
column_to_rownames("gene_name") %>%
pheatmap(cluster_cols = F,cluster_rows = F,
na_col = "grey80",
cellwidth = 12, cellheight = 12,
color = colorRampPalette(brewer.pal(n = 7, name = "YlOrRd"))(100),
)
#dev.off()
#png("plots/ribo_leadingEdgePheatmap.png", width = 35, height = 5, units = "cm", res = 800)
fgseaRes1 %>%
bind_rows() %>%
dplyr::filter(pathway == "KEGG_RIBOSOME") %>%
dplyr::select(leadingEdge, coef) %>%
set_colnames(c("gene_id", "coef")) %>%
unnest(cols = c(gene_id)) %>%
left_join(toptable_1 %>% bind_rows(), by = c("coef", "gene_id")) %>%
dplyr::select(gene_name, coef, logFC) %>%
dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>%
spread(key = "coef", value = "logFC") %>%
column_to_rownames("gene_name") %>%
pheatmap(cluster_cols = F,cluster_rows = F,
na_col = "grey80",
cellwidth = 12, cellheight = 12,
color = colorRampPalette(rev(brewer.pal(n = 7, name = "YlGnBu")))(100),
)
#dev.off()
toptable_1 %>%
bind_rows() %>%
dplyr::filter(gene_id %in% KEGG$KEGG_RIBOSOME) %>%
dplyr::select(gene_id, coef, logFC) %>%
spread(key = "coef", value = "logFC") %>%
column_to_rownames("gene_id") %>%
pheatmap(colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
show_rownames = FALSE,
treeheight_row = 0, treeheight_col = 0,
breaks = c(seq(min(.), 0, length.out=ceiling(100/2) + 1),
seq(max(.)/100, max(.), length.out=floor(100/2)))
)
# toptable_Q96 %>%
# dplyr::filter(ensembl_gene_id %in% KEGG$KEGG_RIBOSOME) %>%
# dplyr::select(ensembl_gene_id, logFC) %>%
# column_to_rownames("ensembl_gene_id") %>%
# pheatmap(colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
# show_rownames = FALSE, cluster_cols = F,
# treeheight_row = 0, treeheight_col = 0,
# breaks = c(seq(min(.), 0, length.out=ceiling(100/2) + 1),
# seq(max(.)/100, max(.), length.out=floor(100/2)))
# )
annocol_2 <- KEGG$KEGG_PATHWAYS_IN_CANCER %>%
as.data.frame() %>%
set_colnames("gene_id") %>%
mutate(`EOfAD-like/+` = case_when(
gene_id %in% c(fgseaRes1$`EOfAD-like/+` %>%
dplyr::filter(pathway == "KEGG_PATHWAYS_IN_CANCER") %>%
.$leadingEdge %>% unlist) ~ "TRUE"),
`fAI-like/+` = case_when(
gene_id %in% c(fgseaRes1$`fAI-like/+` %>% dplyr::filter(pathway == "KEGG_PATHWAYS_IN_CANCER") %>% .$leadingEdge %>% unlist) ~ "TRUE"
)
)%>%
replace_na(list(`EOfAD-like/+` = FALSE, `fAI-like/+` = FALSE)) %>%
inner_join(x$genes) %>%
dplyr::distinct(gene_name, .keep_all = TRUE) %>%
dplyr::select(gene_name, `EOfAD-like/+`, `fAI-like/+`) %>%
column_to_rownames("gene_name")
#png("plots/KEGG_PATHWAYS_IN_CANCER.png", width = 35, height = 15, units = "cm", res = 600)
toptable_1 %>%
bind_rows() %>%
dplyr::filter(gene_id %in% KEGG$KEGG_PATHWAYS_IN_CANCER) %>%
dplyr::select(gene_name, coef, logFC) %>%
dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>%
spread(key = "coef", value = "logFC") %>%
column_to_rownames("gene_name") %>%
t() %>%
pheatmap(
#cellwidth = 10,
cellheight = 10,
show_colnames = F,
treeheight_row = 0, treeheight_col = 80,
breaks = seq(-max(abs(.)), max(abs(.)), length.out = 100),
color = colorRampPalette(rev(brewer.pal(n = 7,
name = "RdBu")))(100),
main = "KEGG_PATHWAYS_IN_CANCER" ,
annotation_col = annocol_2,
width = 35,
annotation_colors = list(
`EOfAD-like/+` = c("FALSE" = "grey70", "TRUE" = "#6bff97"),
`fAI-like/+` = c("FALSE" = "grey70", "TRUE" = "#6bff97"))
)
#dev.off()
fryRes_nopsen1 <- design %>% colnames() %>% .[2:3] %>%
sapply(function(y) {
cpm(x$counts, log = TRUE)[!grepl(rownames(cpm(x$counts)), pattern = "ENSDARG00000004870"),] %>% fry(
index = c(KEGG, ireGenes),
design = design,
contrast = y,
sort = "mixed"
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = y)
}, simplify = FALSE)
cameraRes_nopsen1 <- design %>% colnames() %>% .[2:3] %>%
sapply(function(y) {
cpm(x$counts, log = TRUE)[!grepl(rownames(cpm(x$counts)), pattern = "ENSDARG00000004870"),] %>%
camera(
index = c(KEGG, ireGenes),
design = design,
contrast = y,
inter.gene.cor = NA,
sort = TRUE
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = y,
sig = FDR < 0.05)
}, simplify = FALSE)
ranks_fgsea_nopsen1 <-
sapply(toptable_1, function(x) {
x %>%
dplyr::filter(gene_id != "ENSDARG00000004870") %>%
mutate(PValue_withsign = sign(logFC) * log10(1/PValue)) %>%
arrange(PValue_withsign) %>%
dplyr::select(c("gene_id", "PValue_withsign")) %>% #only want the Pvalue with sign
with(structure(PValue_withsign, names = gene_id)) %>%
rev() # reverse so the start of the list is upregulated genes
}, simplify = FALSE)
set.seed(1)
fgseaRes_nopsen1 <- ranks_fgsea_nopsen1 %>%
lapply(function(y){
fgseaMultilevel(stats = y, pathways = c(KEGG, ireGenes), eps = 0) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::select(pathway, pval, FDR, padj, everything()) %>%
arrange(pval) %>%
mutate(sig = padj < 0.05)
})
fgseaRes_nopsen1$`EOfAD-like/+` %<>%
mutate(coef = "EOfAD-like/+" )
fgseaRes_nopsen1$`fAI-like/+` %<>%
mutate(coef = 'fAI-like/+')
HMP_nopsen1 <- fryRes_nopsen1 %>%
bind_rows() %>%
dplyr::select(pathway, PValue.Mixed, coef) %>%
dplyr::rename(fry_p = PValue.Mixed) %>%
left_join(cameraRes_nopsen1 %>%
bind_rows() %>%
dplyr::select(pathway, PValue, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(camera_p = PValue) %>%
left_join(fgseaRes_nopsen1 %>%
bind_rows() %>%
dplyr::select(pathway, pval, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(fgsea_p = pval) %>%
bind_rows() %>%
nest(p = one_of(c("fry_p", "camera_p", "fgsea_p"))) %>%
mutate(harmonic_p = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
p.hmp(x, L = 4)
}, numeric(1))
) %>%
unnest() %>%
mutate(harmonic_p_FDR = p.adjust(harmonic_p, "fdr"),
sig = harmonic_p_FDR < 0.05) %>%
arrange(harmonic_p)
HMP_nopsen1 %>%
dplyr::filter(harmonic_p_FDR < 0.05) %>%
ggplot(aes(x = coef, y = pathway)) +
geom_point(size = 4) +
geom_label(aes(label = signif(harmonic_p_FDR, digits = 2)))
notchTargets <-
msigdbr("Danio rerio", category = "C6") %>%
dplyr::filter(gs_name %in% c("NOTCH_DN.V1_UP", "NOTCH_DN.V1_DN","NGUYEN_NOTCH1_TARGETS_UP", "NGUYEN_NOTCH1_TARGETS_DN")) %>%
rbind(msigdbr("Danio rerio", category = "C2") %>%
dplyr::filter(gs_name %in% c("RYAN_MANTLE_CELL_LYMPHOMA_NOTCH_DIRECT_UP"))
) %>%
rbind(msigdbr("Danio rerio", category = "C2") %>%
dplyr::filter(gs_name %in% c("NGUYEN_NOTCH1_TARGETS_UP", "NGUYEN_NOTCH1_TARGETS_DN"))
) %>%
left_join(x$genes, by = c("gene_symbol" = "gene_name")) %>%
dplyr::filter(!is.na(gene_id)) %>%
distinct(gs_name, gene_id, .keep_all = TRUE) %>%
split(f = .$gs_name) %>%
lapply(extract2, "gene_id")
fryRes_notch <- design %>% colnames() %>% .[2:3] %>%
sapply(function(y) {
cpm(x$counts, log = TRUE) %>%
fry(
index = notchTargets,
design = design,
contrast = y,
sort = "mixed"
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = y)
}, simplify = FALSE)
cameraRes_notch <- design %>% colnames() %>% .[2:3] %>%
sapply(function(y) {
x %>%
cpm(log = TRUE) %>%
camera(
index = notchTargets,
design = design,
contrast = y,
inter.gene.cor = NA,
sort = TRUE
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = y,
#sig = FDR < 0.05
)
}, simplify = FALSE)
set.seed(33)
fgseaRes_notch <- ranks_fgsea1 %>%
lapply(function(x){
fgseaMultilevel(stats = x, pathways = notchTargets) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::select(pathway, pval, FDR, padj, everything()) %>%
arrange(pval) %>%
mutate(sig = padj < 0.05)
})
fgseaRes_notch$`EOfAD-like/+` %<>%
mutate(coef = "EOfAD-like/+" )
fgseaRes_notch$`fAI-like/+` %<>%
mutate(coef = 'fAI-like/+')
HMP_notch <- fryRes_notch %>%
bind_rows() %>%
dplyr::select(pathway, PValue, coef) %>%
dplyr::rename(fry_p = PValue) %>%
left_join(cameraRes_notch %>%
bind_rows() %>%
dplyr::select(pathway, PValue, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(camera_p = PValue) %>%
left_join(fgseaRes_notch %>%
bind_rows() %>%
dplyr::select(pathway, pval, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(fgsea_p = pval) %>%
bind_rows() %>%
nest(p = one_of(c("fry_p", "camera_p", "fgsea_p"))) %>%
mutate(harmonic_p = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
p.hmp(x, L = 4)
}, numeric(1))
) %>%
unnest() %>%
mutate(harmonic_p_FDR = p.adjust(harmonic_p, "fdr"),
sig = harmonic_p_FDR < 0.05) %>%
arrange(harmonic_p)
HMP_notch
## # A tibble: 10 x 8
## pathway coef fry_p camera_p fgsea_p harmonic_p harmonic_p_FDR sig
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
## 1 RYAN_MANTLE_… EOfAD… 0.00968 0.00810 2.23e-4 0.000642 0.00642 TRUE
## 2 RYAN_MANTLE_… fAI-l… 0.0304 0.0491 5.10e-4 0.00151 0.00756 TRUE
## 3 NOTCH_DN.V1_… EOfAD… 0.0659 0.208 4.80e-2 0.105 0.349 FALSE
## 4 NGUYEN_NOTCH… fAI-l… 0.148 0.487 6.23e-1 0.435 0.612 FALSE
## 5 NOTCH_DN.V1_… EOfAD… 0.163 0.942 5.38e-1 0.475 0.612 FALSE
## 6 NOTCH_DN.V1_… fAI-l… 0.166 0.712 8.10e-1 0.488 0.612 FALSE
## 7 NOTCH_DN.V1_… fAI-l… 0.228 0.655 6.60e-1 0.528 0.612 FALSE
## 8 NGUYEN_NOTCH… EOfAD… 0.297 0.660 9.16e-1 0.578 0.612 FALSE
## 9 NGUYEN_NOTCH… fAI-l… 0.321 0.788 7.27e-1 0.585 0.612 FALSE
## 10 NGUYEN_NOTCH… EOfAD… 0.407 0.860 7.53e-1 0.612 0.612 FALSE
#
# annocol <- notchTargets$RYAN_MANTLE_CELL_LYMPHOMA_NOTCH_DIRECT_UP %>%
# as.data.frame() %>%
# set_colnames("gene_id") %>%
# mutate(`EOfAD-like/+` = case_when(
# gene_id %in% fgseaRes_notch$`EOfAD-like/+`$leadingEdge[[1]] ~ "TRUE"),
# `fAI-like/+` = case_when(
# gene_id %in% fgseaRes_notch$`fAI-like/+`$leadingEdge[[1]] ~ "TRUE"
# )
# ) %>%
# replace_na(list(`EOfAD-like/+` = FALSE, `fAI-like/+` = FALSE)) %>%
# inner_join(x$genes) %>%
# dplyr::distinct(gene_name, .keep_all = TRUE) %>%
# dplyr::select(gene_name, `EOfAD-like/+`, `fAI-like/+`) %>%
# column_to_rownames("gene_name")
#
# png("plots/ryanetalPheatmap.png", width = 60, height = 20, units = "cm", res = 600)
# toptable_1 %>%
# bind_rows() %>%
# dplyr::filter(gene_id %in% notchTargets$RYAN_MANTLE_CELL_LYMPHOMA_NOTCH_DIRECT_UP) %>%
# dplyr::select(gene_name, coef, logFC) %>%
# dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>%
# spread(key = "coef", value = "logFC") %>%
# column_to_rownames("gene_name") %>%
# t() %>%
# pheatmap(cellwidth = 10, cellheight = 10,
# treeheight_row = 0, treeheight_col = 80,
# breaks = seq(-max(abs(.)), max(abs(.)), length.out = 100),
# color = colorRampPalette(rev(brewer.pal(n = 7,
# name = "RdBu")))(100),
# main = "RYAN_MANTLE_CELL_LYMPHOMA_NOTCH_DIRECT_UP" ,
# annotation_col = annocol,
# annotation_colors = list(
# `EOfAD-like/+` = c("FALSE" = "grey70", "TRUE" = "#6bff97"),
# `fAI-like/+` = c("FALSE" = "grey70", "TRUE" = "#6bff97"))
# )
# dev.off()
#
# fgseaRes_notch %>%
# lapply(function(y) {
# y %>%
# dplyr::filter(pathway == "RYAN_MANTLE_CELL_LYMPHOMA_NOTCH_DIRECT_UP") %>%
# dplyr::select(leadingEdge)
# }) %>%
# unlist %>%
# as.data.frame() %>%
# rownames_to_column("coef") %>%
# set_colnames(c("coef", "gene_id")) %>%
# mutate(coef = str_remove(coef, pattern = ".leadingEdge[0-9]+")) %>%
# split(f = .$coef) %>%
# lapply(magrittr::extract2, "gene_id") %>%
# UpSetR::fromList() %>%
# UpSetR::upset(order.by = "freq")
#
# toptable_1 %>%
# bind_rows() %>%
# dplyr::filter(gene_id %in% intersect(fgseaRes_notch$`EOfAD-like/+`[1,]$leadingEdge[[1]],
# fgseaRes_notch$`fHS-like/+`[1,]$leadingEdge[[1]])) %>%
# dplyr::select(gene_name, coef, logFC) %>%
# dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>%
# spread(key = "coef", value = "logFC") %>%
# column_to_rownames("gene_name") %>%
# pheatmap(cluster_cols = FALSE, clustering_method = 'average',
# breaks = seq(-max(abs(.)), max(abs(.)), length.out = 100),
# color = colorRampPalette(rev(brewer.pal(n = 7,
# name = "RdBu")))(100),
# main = "Ryan et al. leading edge shared genes")
#
#
# toptable_1 %>%
# bind_rows() %>%
# dplyr::filter(gene_id %in% fgseaRes_notch$`EOfAD-like/+`[1,]$leadingEdge[[1]])%>%
# dplyr::select(gene_name, coef, logFC) %>%
# dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>%
# spread(key = "coef", value = "logFC") %>%
# column_to_rownames("gene_name") %>%
# pheatmap(cluster_cols = FALSE, clustering_method = 'average',
# breaks = seq(-max(abs(.)), max(abs(.)), length.out = 100),
# color = colorRampPalette(rev(brewer.pal(n = 7,
# name = "RdBu")))(100),
# main = "Ryan et al. leading edge shared genes")
#
# toptable_1 %>%
# bind_rows() %>%
# dplyr::filter(gene_id %in% fgseaRes_notch$`fHS-like/+`[1,]$leadingEdge[[1]])%>%
# dplyr::select(gene_name, coef, logFC) %>%
# dplyr::distinct(gene_name, coef, .keep_all = TRUE) %>%
# spread(key = "coef", value = "logFC") %>%
# column_to_rownames("gene_name") %>%
# pheatmap(cluster_cols = FALSE, clustering_method = 'average',
# breaks = seq(-max(abs(.)), max(abs(.)), length.out = 100),
# color = colorRampPalette(rev(brewer.pal(n = 7,
# name = "RdBu")))(100),
# main = "Ryan et al. leading edge shared genes")
design %>% colnames() %>% .[2:3] %>%
sapply(function(y) {
cpm(x$counts, log = TRUE) %>%
mroast(
index = notchTargets,
design = design,
contrast = y,
sort = "mixed"
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = y)
}, simplify = FALSE) %>%
bind_rows() %>%
mutate(Upregulated = NGenes*PropUp,
Downregulated = NGenes*PropDown,
ns = NGenes - Upregulated - Downregulated) %>%
left_join(HMP_notch) %>%
dplyr::select(pathway, Upregulated, Downregulated, ns, coef, harmonic_p, sig) %>%
gather(key = "Direction", value = "NGenes", c("Upregulated", "Downregulated", "ns")) %>%
mutate(Direction = factor(Direction, levels = c("ns", "Downregulated", "Upregulated"))) %>%
ggplot(aes(x = pathway)) +
geom_col(aes(y = NGenes, fill = Direction, alpha = sig)) +
# geom_text(aes(label = signif(harmonic_p, 2), y = NGenes-10),
# data = . %>%
# dplyr::distinct(coef, pathway, .keep_all = TRUE)
# ) +
coord_flip() +
scale_alpha_manual(values = c(0.5, 1)) +
scale_fill_manual(values = c("grey50", "#2166AC", "#B2182B")) +
scale_color_manual(values = c("NA", "black")) +
theme(legend.position = "bottom") +
facet_wrap(~coef) +
labs(y = "Number of genes",
x = "Notch gene set",
fill = "Direction" ,
alpha = "FDR-adjusted\nHMP< 0.05") +
theme_bw()
#ggsave("plots/notchTargets.png", width = 20, height = 10, units = "cm", dpi = 600, scale = 1.3)
psen2_gene_id <- "ENSDARG00000015540"
cpm(x, log = TRUE) %>%
as.data.frame() %>%
.[psen2_gene_id,] %>%
gather(key = "sample", value = "logCPM") %>%
left_join(x$samples) %>%
ggplot(aes(x = Genotype, y = logCPM, fill = Genotype)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(aes(shape = Sex),
size = 3, colour = "black") +
theme_bw() +
theme(aspect.ratio = 1) +
easy_rotate_x_labels(angle = -45) +
labs(x = "Genotype",
fill = "Genotype") +
stat_compare_means(comparisons = list(c("WT", "EOfAD-like/+"),
c("WT", "fAI-like/+")))
#ggsave("plots/psen2expression.png", width = 10, height = 10, units = "cm", dpi = 600, scale = 1.5)
# This code loads the function in the working environment
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")
cell_type_markers <- read_xls("gene_sets/cell_type_genes41586_2007_BFnature05453_MOESM11_ESM.xls") %>%
as_tibble() %>%
dplyr::select(c("Neuron-enriched", "Astrocyte-enriched", "Oligodendrocyte-enriched")) %>%
gather(key = "cell_type", value = "gene_name")
# these are in mouse, so want to convert to zebrafish enselbl id.
# went and downloaded this from biomart website
mm2zfID <- read_tsv("gene_sets/mouse_geneName_to_zf_ID.txt") %>%
as_tibble() %>%
set_colnames(c("m_geneID", "gene_name", "gene_id"))
cell_type_markers <- left_join(cell_type_markers, mm2zfID) %>%
na.omit() %>%
dplyr::select(c("cell_type", "gene_id"))
# microglia
microglia <- read_excel("gene_sets/microglia_oosterofetal.xlsx", sheet = 1) %>%
rbind(read_excel("gene_sets/microglia_oosterofetal.xlsx", sheet = 2)) %>%
rbind(read_excel("gene_sets/microglia_oosterofetal.xlsx", sheet = 3)) %>%
.$Zebrafish.Ensembl.Gene.ID %>%
unique
ggarrange(
#Neurons
x %>%
cpm(log=TRUE) %>%
as.data.frame() %>%
rownames_to_column("gene_id") %>%
dplyr::filter(gene_id %in% c(cell_type_markers %>%
dplyr::filter(cell_type == "Neuron-enriched") %>%
.$gene_id)
) %>%
gather(key = "sample", value = "logCPM", colnames(x)) %>%
left_join(x$samples) %>%
ggplot(aes(x = Genotype, y = logCPM)) +
geom_flat_violin(
aes(fill = Genotype),
position = position_nudge(x = 0.2, y = 0),
alpha = 0.75,
trim = FALSE) +
geom_point(
aes(colour = Genotype),
position = position_jitter(width = 0.15), size = 1, alpha = 0.1 ) +
geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.8) +
theme_bw() +
labs(x = "",
fill = "Genotype") +
ggtitle("Neurons") +
coord_flip() +
stat_compare_means(method = 'anova',label = "p.signif", label.y = 15, label.x = "EOfAD-like/+")
,
# Oligodendrocytes
x %>%
cpm(log=TRUE) %>%
as.data.frame() %>%
rownames_to_column("gene_id") %>%
dplyr::filter(gene_id %in% c(cell_type_markers %>%
dplyr::filter(cell_type == "Oligodendrocyte-enriched") %>%
.$gene_id)
) %>%
gather(key = "sample", value = "logCPM", colnames(x)) %>%
left_join(x$samples) %>%
ggplot(aes(x = Genotype, y = logCPM)) +
geom_flat_violin(
aes(fill = Genotype),
position = position_nudge(x = 0.2, y = 0),
alpha = 0.75,
trim = FALSE) +
geom_point(
aes(colour = Genotype),
position = position_jitter(width = 0.15), size = 1, alpha = 0.1 ) +
geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.8) +
theme_bw() +
labs(x = "",
fill = "Genotype") +
ggtitle("Oligodendrocytes") +
stat_compare_means(method = 'anova',label = "p.signif", label.y = 15, label.x = "EOfAD-like/+") +
coord_flip(),
#Astrocytes
x %>%
cpm(log=TRUE) %>%
as.data.frame() %>%
rownames_to_column("gene_id") %>%
dplyr::filter(gene_id %in% c(cell_type_markers %>%
dplyr::filter(cell_type == "Astrocyte-enriched") %>%
.$gene_id)
) %>%
gather(key = "sample", value = "logCPM", colnames(x)) %>%
left_join(x$samples) %>%
ggplot(aes(x = Genotype, y = logCPM)) +
geom_flat_violin(
aes(fill = Genotype),
position = position_nudge(x = 0.2, y = 0),
alpha = 0.75,
trim = FALSE) +
geom_point(
aes(colour = Genotype),
position = position_jitter(width = 0.15), size = 1, alpha = 0.25 ) +
geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.8) +
theme_bw() +
labs(x = "",
fill = "Genotype") +
ggtitle("Astrocytes") +
coord_flip() +
stat_compare_means(method = 'anova', label = "p.signif", label.y = 15, label.x = "EOfAD-like/+"),
# microglia
x %>%
cpm(log=TRUE) %>%
as.data.frame() %>%
rownames_to_column("gene_id") %>%
dplyr::filter(gene_id %in% microglia) %>%
gather(key = "sample", value = "logCPM", colnames(x)) %>%
left_join(x$samples) %>%
ggplot(aes(x = Genotype, y = logCPM)) +
geom_flat_violin(
aes(fill = Genotype),
position = position_nudge(x = 0.2, y = 0),
alpha = 0.75,
trim = FALSE) +
geom_point(
aes(colour = Genotype),
position = position_jitter(width = 0.15), size = 1, alpha = 0.25 ) +
geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.8) +
theme_bw() +
labs(x = "",
fill = "Genotype") +
coord_flip() +
stat_compare_means(method = 'anova', label = "p.signif", label.y = 15, label.x = "EOfAD-like/+") +
ggtitle("Microglia"),
legend = "none",
labels = "AUTO"
) +
ggsave("plots/cellTypeCheck.tiff", width = 22, height = 18, units = "cm", dpi = 600)
cell_type_markers %<>% split(f = .$cell_type) %>% lapply(extract2, "gene_id")
cell_type_markers$Microglia <- microglia
design %>% colnames() %>% .[2:3] %>%
sapply(function(y) {
cpm(x$counts, log = TRUE) %>%
fry(
index = cell_type_markers,
design = design,
contrast = y,
sort = "mixed"
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = y)
}, simplify = FALSE)
## $`EOfAD-like/+`
## # A tibble: 4 x 8
## pathway NGenes Direction PValue FDR PValue.Mixed FDR.Mixed coef
## <chr> <int> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Microglia 432 Up 0.0201 0.0802 0.148 0.594 EOfAD-l…
## 2 Astrocyte-enri… 40 Up 0.220 0.440 0.415 0.636 EOfAD-l…
## 3 Oligodendrocyt… 86 Up 0.694 0.694 0.489 0.636 EOfAD-l…
## 4 Neuron-enriched 71 Down 0.666 0.694 0.636 0.636 EOfAD-l…
##
## $`fAI-like/+`
## # A tibble: 4 x 8
## pathway NGenes Direction PValue FDR PValue.Mixed FDR.Mixed coef
## <chr> <int> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Microglia 432 Up 0.0745 0.252 0.196 0.317 fAI-li…
## 2 Astrocyte-enrich… 40 Up 0.126 0.252 0.221 0.317 fAI-li…
## 3 Neuron-enriched 71 Down 0.799 0.799 0.313 0.317 fAI-li…
## 4 Oligodendrocyte-… 86 Up 0.471 0.628 0.317 0.317 fAI-li…
comp_w_Q96 <- read_rds("~/Documents/EOfAD_signature/R_objects/psen1Q96_HMP.rds") %>%
mutate(coef = "Q96_K97del/+") %>%
bind_rows(HMP_1 %>%
dplyr::select(pathway, coef, harmonic_p, harmonic_p_FDR))
sig_all <- comp_w_Q96 %>%
dplyr::filter(harmonic_p_FDR < 0.05) %>%
.$pathway %>%
unique
sig_in_all <- comp_w_Q96 %>%
mutate(sig = harmonic_p_FDR < 0.05) %>%
dplyr::select(pathway, sig, coef) %>%
spread(key = 'coef', value = 'sig') %>%
mutate(sig_in = case_when(
`EOfAD-like/+` == TRUE & `fAI-like/+` == TRUE & `Q96_K97del/+` == TRUE ~ 'all',
`EOfAD-like/+` == "FALSE" & `fAI-like/+` == TRUE & `Q96_K97del/+` == "FALSE" ~ 'fHS only',
`EOfAD-like/+` == TRUE & `fAI-like/+` == "FALSE" & `Q96_K97del/+` == "FALSE" ~ "T428del only",
`EOfAD-like/+` == "FALSE" & `fAI-like/+` == "FALSE" & `Q96_K97del/+` == TRUE ~ "Q96 only",
`EOfAD-like/+` == "TRUE" & `fAI-like/+` == "FALSE" & `Q96_K97del/+` == TRUE ~ "b_Q96 and T428 only",
`EOfAD-like/+` == TRUE & `fAI-like/+` == TRUE & `Q96_K97del/+` == "FALSE" ~ "fHS and T428 only",
TRUE ~ 'neither'
)) %>%
dplyr::select(pathway, sig_in)
comp_w_Q96 %>%
dplyr::filter(grepl(pathway, pattern = "KEGG|^ire")) %>%
dplyr::filter(pathway %in% sig_all) %>%
left_join(sig_in_all) %>%
dplyr::filter(sig_in != "fHS only") %>%
mutate(sig = harmonic_p_FDR < 0.05) %>%
ggplot(aes(x = coef, y = pathway)) +
geom_tile(aes(fill = -log10(harmonic_p)
)
) +
geom_tile(aes(alpha = harmonic_p_FDR > 0.05), show.legend = FALSE) +
geom_text(aes(label = signif(harmonic_p_FDR, digits = 2)),
fontface = "bold") +
geom_text(aes(label = signif(harmonic_p_FDR, digits = 2)),
data = . %>% dplyr::filter(harmonic_p_FDR > 0.05),
colour = "white",
fontface = "bold"
) +
theme_bw() +
theme(panel.grid = element_blank(),
panel.border = element_blank(),
strip.background = element_blank(),
strip.text = element_blank()
) +
scale_fill_viridis_c(option = 'viridis') +
facet_grid(rows = vars(sig_in), scales = 'free_y', shrink = FALSE, space = 'free_y') +
labs(x = "",
y = "Gene set")
#ggsave("plots/comp_w_Q96.png", width = 10, height = 10, units = "cm", dpi = 800, scale = 2)
saveRDS(x, "R_objects/dge.rds")
saveRDS(toptable_1, "R_objects/toptable_raw.rds")
# make a copy for export
toptable_forExport <- toptable_1
names(toptable_forExport) <- c("EOfADLike", "fHSLike")
writexl::write_xlsx(toptable_forExport, path = "R_objects/toptable_raw.xlsx")
writexl::write_xlsx(HMP_1, path = "R_objects/HMP_raw.xlsx")
saveRDS(HMP_1, "R_objects/HMP_raw_KEGG_IRE.rds")
sessionInfo() %>%
pander()
R version 4.0.2 (2020-06-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
locale: en_AU.UTF-8||en_AU.UTF-8||en_AU.UTF-8||C||en_AU.UTF-8||en_AU.UTF-8
attached base packages: stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: ensembldb(v.2.12.1), AnnotationFilter(v.1.12.0), GenomicFeatures(v.1.40.1), AnnotationDbi(v.1.50.3), Biobase(v.2.48.0), GenomicRanges(v.1.40.0), GenomeInfoDb(v.1.24.2), IRanges(v.2.22.2), S4Vectors(v.0.26.1), kableExtra(v.1.3.1), pander(v.0.6.3), UpSetR(v.1.4.0), RColorBrewer(v.1.1-2), pathview(v.1.28.1), goseq(v.1.40.0), geneLenDataBase(v.1.24.0), BiasedUrn(v.1.07), harmonicmeanp(v.3.0), FMStable(v.0.1-2), fgsea(v.1.14.0), ggeasy(v.0.1.2), pheatmap(v.1.0.12), msigdbr(v.7.2.1), ggrepel(v.0.8.2), cowplot(v.1.1.0), ggpubr(v.0.4.0), scales(v.1.1.1), readxl(v.1.3.1), AnnotationHub(v.2.20.2), BiocFileCache(v.1.12.1), dbplyr(v.2.0.0), ngsReports(v.1.4.2), BiocGenerics(v.0.34.0), ggfortify(v.0.4.11), edgeR(v.3.30.3), limma(v.3.44.3), magrittr(v.2.0.1), forcats(v.0.5.0), stringr(v.1.4.0), dplyr(v.1.0.2), purrr(v.0.3.4), readr(v.1.4.0), tidyr(v.1.1.2), tibble(v.3.0.4), ggplot2(v.3.3.2) and tidyverse(v.1.3.0)
loaded via a namespace (and not attached): utf8(v.1.1.4), tidyselect(v.1.1.0), RSQLite(v.2.2.1), htmlwidgets(v.1.5.2), FactoMineR(v.2.3), grid(v.4.0.2), BiocParallel(v.1.22.0), munsell(v.0.5.0), statmod(v.1.4.35), DT(v.0.16), withr(v.2.3.0), colorspace(v.2.0-0), highr(v.0.8), knitr(v.1.30), rstudioapi(v.0.13), leaps(v.3.1), ggsignif(v.0.6.0), labeling(v.0.4.2), KEGGgraph(v.1.48.0), GenomeInfoDbData(v.1.2.3), hwriter(v.1.3.2), farver(v.2.0.3), bit64(v.4.0.5), rhdf5(v.2.32.4), vctrs(v.0.3.5), generics(v.0.1.0), xfun(v.0.19), R6(v.2.5.0), locfit(v.1.5-9.4), bitops(v.1.0-6), DelayedArray(v.0.14.1), assertthat(v.0.2.1), promises(v.1.1.1), gtable(v.0.3.0), rlang(v.0.4.9), scatterplot3d(v.0.3-41), splines(v.4.0.2), rtracklayer(v.1.48.0), rstatix(v.0.6.0), lazyeval(v.0.2.2), broom(v.0.7.2), BiocManager(v.1.30.10), yaml(v.2.2.1), reshape2(v.1.4.4), abind(v.1.4-5), modelr(v.0.1.8), backports(v.1.2.0), httpuv(v.1.5.4), tools(v.4.0.2), ellipsis(v.0.3.1), ggdendro(v.0.1.22), Rcpp(v.1.0.5), plyr(v.1.8.6), progress(v.1.2.2), zlibbioc(v.1.34.0), RCurl(v.1.98-1.2), prettyunits(v.1.1.1), openssl(v.1.4.3), zoo(v.1.8-8), SummarizedExperiment(v.1.18.2), haven(v.2.3.1), cluster(v.2.1.0), fs(v.1.5.0), data.table(v.1.13.2), openxlsx(v.4.2.3), reprex(v.0.3.0), truncnorm(v.1.0-8), ProtGenerics(v.1.20.0), matrixStats(v.0.57.0), hms(v.0.5.3), mime(v.0.9), evaluate(v.0.14), xtable(v.1.8-4), XML(v.3.99-0.5), rio(v.0.5.16), jpeg(v.0.1-8.1), gridExtra(v.2.3), compiler(v.4.0.2), biomaRt(v.2.44.4), writexl(v.1.3.1), crayon(v.1.3.4), htmltools(v.0.5.0), mgcv(v.1.8-33), later(v.1.1.0.1), lubridate(v.1.7.9.2), DBI(v.1.1.0), MASS(v.7.3-53), rappdirs(v.0.3.1), ShortRead(v.1.46.0), Matrix(v.1.2-18), car(v.3.0-10), cli(v.2.2.0), pkgconfig(v.2.0.3), flashClust(v.1.01-2), GenomicAlignments(v.1.24.0), foreign(v.0.8-80), plotly(v.4.9.2.1), xml2(v.1.3.2), webshot(v.0.5.2), XVector(v.0.28.0), rvest(v.0.3.6), digest(v.0.6.27), graph(v.1.66.0), Biostrings(v.2.56.0), rmarkdown(v.2.5), cellranger(v.1.1.0), fastmatch(v.1.1-0), curl(v.4.3), shiny(v.1.5.0), Rsamtools(v.2.4.0), lifecycle(v.0.2.0), nlme(v.3.1-150), jsonlite(v.1.7.1), Rhdf5lib(v.1.10.1), carData(v.3.0-4), viridisLite(v.0.3.0), askpass(v.1.1), fansi(v.0.4.1), pillar(v.1.4.7), lattice(v.0.20-41), KEGGREST(v.1.28.0), fastmap(v.1.0.1), httr(v.1.4.2), GO.db(v.3.11.4), interactiveDisplayBase(v.1.26.3), glue(v.1.4.2), zip(v.2.1.1), png(v.0.1-7), BiocVersion(v.3.11.1), bit(v.4.0.4), Rgraphviz(v.2.32.0), stringi(v.1.5.3), blob(v.1.2.1), org.Hs.eg.db(v.3.11.4), latticeExtra(v.0.6-29) and memoise(v.1.1.0)